There have four different functions.
In [74]:
import gzip
import pickle
from os import path
from collections import defaultdict
from numpy import sign
"""
Load buzz data as a dictionary.
You can give parameter for data so that you will get what you need only.
"""
def load_buzz(root='../data', data=['train', 'test', 'questions'], format='pklz'):
buzz_data = {}
for ii in data:
file_path = path.join(root, ii + "." + format)
with gzip.open(file_path, "rb") as fp:
buzz_data[ii] = pickle.load(fp)
return buzz_data
In [75]:
from numpy import sign, abs
def _feat_basic(bd, group):
X = []
for item in bd[group].items():
qid = item[1]['qid']
q = bd['questions'][qid]
item[1]['q_length'] = max(q['pos_token'].keys())
item[1]['category'] = q['category'].lower()
item[1]['answer'] = q['answer'].lower()
X.append(item[1])
return X
def _feat_sign_val(data):
for item in data:
item['sign_val'] = sign(item['position'])
def _get_avg_pos(data, sign_val=None):
unwanted_index = []
pos_uid = defaultdict(list)
pos_qid = defaultdict(list)
for index, key in enumerate(data):
if sign_val and sign(data[key]['position']) != sign_val:
unwanted_index.append(index)
else:
pos_uid[data[key]['uid']].append(data[key]['position'])
pos_qid[data[key]['qid']].append(data[key]['position'])
avg_pos_uid = {}
avg_pos_qid = {}
if not sign_val:
sign_val = 1
for key in pos_uid:
pos = abs(pos_uid[key])
avg_pos_uid[key] = sign_val * (sum(pos) / len(pos))
for key in pos_qid:
pos = abs(pos_qid[key])
avg_pos_qid[key] = sign_val * (sum(pos) / len(pos))
return avg_pos_uid, avg_pos_qid, unwanted_index
def _feat_avg_pos(data, bd, group, sign_val):
avg_pos_uid, avg_pos_qid, unwanted_index = _get_avg_pos(bd['train'], sign_val=sign_val)
if group == 'train':
for index in sorted(unwanted_index, reverse=True):
del data[index]
for item in data:
if item['uid'] in avg_pos_uid:
item['avg_pos_uid'] = avg_pos_uid[item['uid']]
else:
vals = avg_pos_uid.values()
item['avg_pos_uid'] = sum(vals) / float(len(vals))
if item['qid'] in avg_pos_qid:
item['avg_pos_qid'] = avg_pos_qid[item['qid']]
else:
vals = avg_pos_qid.values()
item['avg_pos_qid'] = sum(vals) / float(len(vals))
# Response position can be longer than length of question
if item['avg_pos_uid'] > item['q_length']:
item['avg_pos_uid'] = item['q_length']
if item['avg_pos_qid'] > item['q_length']:
item['avg_pos_qid'] = item['q_length']
In [76]:
def featurize(bd, group, sign_val=None, extra=None):
# Basic features
# qid(string), uid(string), position(float)
# answer'(string), 'potistion'(float), 'qid'(string), 'uid'(string)
X = _feat_basic(bd, group=group)
# Some extra features
if extra:
for func_name in extra:
func_name = '_feat_' + func_name
if func_name in ['_feat_avg_pos']:
globals()[func_name](X, bd, group=group, sign_val=sign_val)
else:
globals()[func_name](X)
if group == 'train':
y = []
for item in X:
y.append(item['position'])
del item['position']
return X, y
elif group == 'test':
return X
else:
raise ValueError(group, 'is not the proper type')
In [77]:
import csv
def select(data, keys):
unwanted = data[0].keys() - keys
for item in data:
for unwanted_key in unwanted:
del item[unwanted_key]
return data
def write_result(test_set, predictions, file_name='guess.csv'):
predictions = sorted([[id, predictions[index]] for index, id in enumerate(test_set.keys())])
predictions.insert(0,["id", "position"])
with open(file_name, "w") as fp:
writer = csv.writer(fp, delimiter=',')
writer.writerows(predictions)
In [78]:
import multiprocessing
from sklearn import linear_model
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_extraction import DictVectorizer
import math
from numpy import abs, sqrt
regression_keys = ['category', 'q_length', 'qid', 'uid', 'answer', 'avg_pos_qid', 'avg_pos_pid']
X_train, y_train = featurize(load_buzz(), group='train', sign_val=None, extra=['sign_val', 'avg_pos'])
X_train = select(X_train, regression_keys)
vec = DictVectorizer()
X_train = vec.fit_transform(X_train)
regressor_names = """
LinearRegression
Ridge
Lasso
ElasticNet
"""
print ("=== Linear Cross validation RMSE scores:")
for regressor in regressor_names.split():
scores = cross_val_score(getattr(linear_model, regressor)(),
X_train, y_train,
cv=10,
scoring='mean_squared_error',
n_jobs=multiprocessing.cpu_count()-1
)
print (regressor, sqrt(abs(scores)).mean())
In [79]:
regression_keys = ['category', 'q_length', 'qid', 'uid', 'answer', 'avg_pos_qid', 'avg_pos_pid']
X_train, y_train = featurize(load_buzz(), group='train', sign_val=None, extra=['avg_pos'])
X_train = select(X_train, regression_keys)
X_test = featurize(load_buzz(), group='test', sign_val=None, extra=['avg_pos'])
X_test = select(X_test, regression_keys)
vec = DictVectorizer()
vec.fit(X_train + X_test)
X_train = vec.transform(X_train)
X_test = vec.transform(X_test)
In [80]:
regressor = linear_model.LassoCV()
regressor.fit(X_train, y_train)
Out[80]:
In [81]:
print(regressor.coef_)
print(regressor.alpha_)
In [82]:
predictions = regressor.predict(X_test)
In [83]:
write_result(bd['test'], predictions)
This submissions scores 84.32000 in Kaggle.